load packages

library(tidyverse)
library(glmnet)
library(caTools)
library(caret)
library(ROCR)

load coded data

coded = read.csv("~/Documents/code/dsnlab/automotion-test-set/tds_artifact_coded_volumes.csv")

load stripe detection data

fileDir = "~/Documents/code/dsnlab/automotion-test-set/output/"
filePattern = "tds_stripes_.*.csv"
  
file_list = list.files(fileDir, pattern = filePattern)

for (file in file_list){
  # if the merged dataset doesn't exist, create it
  if (!exists("stripes")){
    temp = read.csv(paste0(fileDir,file))
    stripes = data.frame(temp) %>% 
      rename("volume" = t,
             "subjectID" = subject) %>%
      select(-file)
    rm(temp)
  }
  
  # if the merged dataset does exist, append to it
  else {
    temp_dataset = read.csv(paste0(fileDir,file))
    temp_dataset = data.frame(temp_dataset) %>% 
      rename("volume" = t,
             "subjectID" = subject) %>%
      select(-file)
    stripes = rbind(stripes, temp_dataset)
    rm(temp_dataset)
  }
}

load global intensities and rps

# define paths and variables
rpDir = '~/Documents/code/dsnlab/automotion-test-set/rp_txt/'
outputDir = '~/Documents/code/tds_auto-motion/auto-motion-output/'
study = "tds2"
rpPattern = "^rp_([0-9]{3})_(.*).txt"
rpCols = c("euclidian_trans","euclidian_rot","euclidian_trans_deriv","euclidian_rot_deriv","trash.rp")

# global intensities
intensities = read.csv(paste0(outputDir,study,'_globalIntensities.csv'))

# edit volume numbers for subject 157, stop3
intensities = intensities %>% 
  mutate(volume = ifelse(subjectID == 157 & run == "stop3" & volume > 43, volume - 1, volume))

# rp files
file_list = list.files(rpDir, pattern = rpPattern)

for (file in file_list){
  # if the merged dataset doesn't exist, create it
  if (!exists("rp")){
    temp = read.table(paste0(rpDir,file))
    colnames(temp) = rpCols
    rp = data.frame(temp, file = rep(file,count(temp))) %>% 
      mutate(volume = row_number()) %>%
      extract(file,c("subjectID","run"), rpPattern) %>%
      mutate(subjectID = as.integer(subjectID))
    rm(temp)
  }
  
  # if the merged dataset does exist, append to it
  else {
    temp_dataset = read.table(paste0(rpDir,file))
    colnames(temp_dataset) = rpCols
    temp_dataset = data.frame(temp_dataset, file = rep(file,count(temp_dataset))) %>% 
      mutate(volume = row_number()) %>%
      extract(file,c("subjectID","run"), rpPattern) %>%
      mutate(subjectID = as.integer(subjectID))
    rp = rbind(rp, temp_dataset)
    rm(temp_dataset)
  }
}

join dataframes

joined = left_join(stripes, coded, by = c("subjectID", "run", "volume")) %>%
  left_join(., intensities, by = c("subjectID", "run", "volume")) %>%
  left_join(., rp, by = c("subjectID", "run", "volume")) %>%
  mutate(striping = ifelse(is.na(striping), 0, striping),
         intensity = ifelse(is.na(intensity), 0, intensity),
         tile = paste0("tile_",tile),
         artifact = ifelse(striping > 1, 1, 0),
         confidence = ifelse(striping == 1, "maybe", "sure"),
         confidence = as.factor(confidence)) %>%
  group_by(subjectID, run, tile) %>% 
  mutate(Diff.mean = volMean - lag(volMean),
         Diff.sd = volSD - lag(volSD)) %>%
  spread(tile, freqtile_power)

load models

setwd("~/Documents/code/dsnlab/automotion-test-set")
model_log = readRDS("model_log_FP.rds")
model_svm = readRDS("model_svm_FP.rds")

split the data

set.seed(101) 
sample = sample.split(joined$artifact, SplitRatio = .75)
training = subset(joined, sample == TRUE)
testing = subset(joined, sample == FALSE)

machine learning

tidy data

ml = joined %>%
  group_by(subjectID, run) %>%
  mutate(Diff.mean = ifelse(is.na(Diff.mean),0,Diff.mean),
         Diff.sd = ifelse(is.na(Diff.sd),0,Diff.sd)) %>%
  gather(tile,freqtile_power, starts_with("tile")) %>%
  mutate(tile = paste0(tile,"_c")) %>%
  group_by(subjectID, run, tile) %>%
  mutate(freqtile_power_c = freqtile_power - mean(freqtile_power, na.rm=TRUE)) %>%
  ungroup() %>%
  select(-freqtile_power) %>%
  spread(tile,freqtile_power_c) %>%
  select(-fsl.volume, -striping, -intensity, -confidence, -trash.rp, -volMean, -volSD, -euclidian_rot, -euclidian_trans) %>%
  select(subjectID, run, volume, artifact, everything())

logistic regression

x = as.matrix(ml[,-c(1,2,3,4)])
y = as.double(as.matrix(ml[, 4]))
pred = predict(model_log, newx = x, s=model_log$lambda.1se, type="response")
pred[pred > .05] = 1
pred[pred < .05] = 0

svm

full_svm = ml  %>%
  mutate(artifact = ifelse(artifact == 1, "yes","no"),
         artifact = as.factor(artifact))

# re-run svm on full sample
full_pred = predict(model_svm, newdata = full_svm, type="prob") %>%
  select(-no)
full_pred =  as.matrix(full_pred)
full_pred[full_pred > .09] = "yes"
full_pred[full_pred < .09] = "no"

auto-motion process

man = joined %>%
  gather(tile, freqtile_power, starts_with("tile")) %>%
  filter(tile %in% c("tile_1", "tile_10")) %>%
  
  # code trash based on mean, sd, and rp
  ungroup %>%
  mutate(meanDiff.mean = mean(Diff.mean, na.rm=TRUE),
         sdDiff.mean = sd(Diff.mean, na.rm=TRUE),
         meanDiff.sd = mean(Diff.sd, na.rm=TRUE),
         sdDiff.sd = sd(Diff.sd, na.rm=TRUE),
         
         # code volumes above mean thresholds as trash
         upper.mean = meanDiff.mean + 2*sdDiff.mean,
         lower.mean = meanDiff.mean - 2*sdDiff.mean,
         trash.mean = ifelse(Diff.mean > upper.mean | Diff.mean < lower.mean, 1, 0),
         trash.mean = ifelse(is.na(Diff.mean),0,trash.mean),
         
         upper.sd = meanDiff.sd + 2*sdDiff.sd,
         lower.sd = meanDiff.sd - 2*sdDiff.sd,
         trash.sd = ifelse(Diff.sd > upper.sd | Diff.sd < lower.sd, 1, 0),
         trash.sd = ifelse(is.na(Diff.sd),0,trash.sd),
         
         # code volumes with more than +/- .25mm translation or rotation in Euclidian distance
         trash.rp.tr = ifelse(euclidian_trans_deriv > .25 | euclidian_trans_deriv < -.25, 1, 0),
         trash.rp.rot = ifelse(euclidian_rot_deriv > .25 | euclidian_rot_deriv < -.25, 1, 0)) %>%
  select(-meanDiff.mean, -meanDiff.sd, -sdDiff.mean, -sdDiff.sd) %>%
  
  # code trash based on striping
  group_by(subjectID, run, tile) %>%
  mutate(freqtile_power_c = freqtile_power - mean(freqtile_power, na.rm=TRUE)) %>%
  ungroup() %>%
  select(-freqtile_power) %>%
  spread(tile,freqtile_power_c) %>%
  mutate(trash.stripe = ifelse(tile_1 < -.035 & tile_10 > .00025, 1, 0)) %>%
  
  # combine trash
  mutate(trash.combined = ifelse(trash.stripe == 1, 1, 0),
         trash.sum = trash.rp.tr + trash.rp.rot + trash.mean + trash.sd + trash.stripe,
         trash.combined = ifelse((trash.rp.tr + trash.rp.rot + trash.mean + trash.sd) > 1, 1, trash.combined)) %>%

  # recode as trash if volume behind and in front are both marked as trash
  mutate(trash.combined = ifelse(trash.combined == 0 & lag(trash.combined) == 1 & lead(trash.combined) == 1, 1, trash.combined)) %>%
         
  # code first volume as trash if second volume is trash
  mutate(trash.combined = ifelse(volume == 1 & lead(trash.combined) == 1, 1, trash.combined)) %>%

  # code hits
  mutate(hits = ifelse(trash.combined == 1 & (artifact == 1), "hit",
                ifelse(trash.combined == 0 & (artifact == 1), "neg",
                ifelse(trash.combined == 1 & (artifact == 0), "pos", NA))),
         label = ifelse(regexpr('.*', hits), as.character(volume), ''),
         hits = as.factor(hits)) %>%
  gather(tile, freqtile_power_c, c("tile_1", "tile_10"))

confusion matrices

logistic regression

confusionMatrix(pred, y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7419   43
##          1  124  116
##                                           
##                Accuracy : 0.9783          
##                  95% CI : (0.9748, 0.9815)
##     No Information Rate : 0.9794          
##     P-Value [Acc > NIR] : 0.7543          
##                                           
##                   Kappa : 0.5708          
##  Mcnemar's Test P-Value : 0.0000000005994 
##                                           
##             Sensitivity : 0.9836          
##             Specificity : 0.7296          
##          Pos Pred Value : 0.9942          
##          Neg Pred Value : 0.4833          
##              Prevalence : 0.9794          
##          Detection Rate : 0.9633          
##    Detection Prevalence : 0.9688          
##       Balanced Accuracy : 0.8566          
##                                           
##        'Positive' Class : 0               
## 

svm

confusionMatrix(full_pred, full_svm$artifact)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  7462   55
##        yes   81  104
##                                           
##                Accuracy : 0.9823          
##                  95% CI : (0.9791, 0.9852)
##     No Information Rate : 0.9794          
##     P-Value [Acc > NIR] : 0.03331         
##                                           
##                   Kappa : 0.5957          
##  Mcnemar's Test P-Value : 0.03205         
##                                           
##             Sensitivity : 0.9893          
##             Specificity : 0.6541          
##          Pos Pred Value : 0.9927          
##          Neg Pred Value : 0.5622          
##              Prevalence : 0.9794          
##          Detection Rate : 0.9688          
##    Detection Prevalence : 0.9760          
##       Balanced Accuracy : 0.8217          
##                                           
##        'Positive' Class : no              
## 

manual

man1 = man %>%
  filter(tile == "tile_1")
confusionMatrix(man1$trash.combined, man1$artifact)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7481   30
##          1   62  129
##                                           
##                Accuracy : 0.9881          
##                  95% CI : (0.9854, 0.9904)
##     No Information Rate : 0.9794          
##     P-Value [Acc > NIR] : 0.000000004137  
##                                           
##                   Kappa : 0.7311          
##  Mcnemar's Test P-Value : 0.001229        
##                                           
##             Sensitivity : 0.9918          
##             Specificity : 0.8113          
##          Pos Pred Value : 0.9960          
##          Neg Pred Value : 0.6754          
##              Prevalence : 0.9794          
##          Detection Rate : 0.9713          
##    Detection Prevalence : 0.9752          
##       Balanced Accuracy : 0.9016          
##                                           
##        'Positive' Class : 0               
## 

plot and compare

join and plot models

# logistic regression
data.plot.log = bind_cols(ml, as.data.frame(y), as.data.frame(pred)) %>%
  mutate(hits = ifelse(y == 1 & `1` == 1, "hit",
                ifelse(y == 0 & `1` == 1, "pos",
                ifelse(y == 1 & `1` == 0, "neg", NA))),
         label = ifelse(regexpr('.*', hits), as.character(volume), ''),
         hits = as.factor(hits))

# svm
data.plot.svm = bind_cols(full_svm, as.data.frame(full_pred)) %>%
  rename("full_pred" = yes) %>%
  mutate(hits = ifelse(artifact == "yes" & full_pred == "yes", "hit",
                ifelse(artifact == "no" & full_pred == "yes", "pos",
                ifelse(artifact == "yes" & full_pred == "no", "neg", NA))),
         label = ifelse(regexpr('.*', hits), as.character(volume), ''),
         hits = as.factor(hits))

# plot and compare models
plot.comp = man %>%
  filter(tile == "tile_1") %>%
  select(subjectID, run, volume, euclidian_trans_deriv, hits, confidence, intensity) %>%
  rename("auto" = hits) %>%
  left_join(data.plot.log, by = c("subjectID", "run", "volume", "euclidian_trans_deriv")) %>%
  select(subjectID, run, volume, euclidian_trans_deriv, auto, hits, confidence, intensity) %>%
  rename("log" = hits) %>%
  left_join(data.plot.svm, by = c("subjectID", "run", "volume", "euclidian_trans_deriv")) %>%
  select(subjectID, run, volume, euclidian_trans_deriv, auto, log, hits, confidence, intensity) %>%
  rename("svm" = hits) %>%
  gather(model, hits, c("auto", "log", "svm")) %>%
  mutate(label = ifelse(regexpr('.*', hits), as.character(volume), ''),
         intensity = as.factor(intensity))

ggplot(plot.comp, aes(x = volume, y = euclidian_trans_deriv)) +
  geom_line(size = .25) +
  geom_point(data = subset(plot.comp, !is.na(hits)), aes(color = hits, alpha = confidence, shape = intensity), size = 2.5) +
  geom_text(aes(label = label), size = 1.5) +
  facet_wrap(~ subjectID + run + model, ncol = 9, scales = "free") +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A","#F21A00")) +
  scale_alpha_manual(values = c(.25, 1)) +
  scale_shape_manual(values = c(16, 17, 15), labels = c("no", "maybe", "yes")) +
  theme(axis.text.x = element_text(size = 6))